# Importing all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly as plotly
plotly.offline.init_notebook_mode()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
Here is a system of linear equations (or linear system) with three equations and three unknown variables:
4x - 3y + z = -10
2x + y + 3z = 0
-x + 2y -5z = 17
# Solve the 3x3 linear system A @ [x, y, z] = b from the equations above.
coefficients = np.array([[4, -3, 1], [2, 1, 3], [-1, 2, -5]])
constants = np.array([-10, 0, 17])
sol = np.linalg.solve(coefficients, constants)
print(sol)
[ 1. 4. -2.]
The goal of this project is to classify texts into Spam vs Not Spam using the Naive Bayes algorithms and comparing them to another type of classification model.
# Load the labeled spam/ham email dataset and display it.
dataset_path = 'Lab2_dataset.csv'
df = pd.read_csv(dataset_path)
df
| Unnamed: 0 | label | text | label_num | |
|---|---|---|---|---|
| 0 | 605 | ham | Subject: enron methanol ; meter # : 988291\nth... | 0 |
| 1 | 2349 | ham | Subject: hpl nom for january 9 , 2001\n( see a... | 0 |
| 2 | 3624 | ham | Subject: neon retreat\nho ho ho , we ' re arou... | 0 |
| 3 | 4685 | spam | Subject: photoshop , windows , office . cheap ... | 1 |
| 4 | 2030 | ham | Subject: re : indian springs\nthis deal is to ... | 0 |
| ... | ... | ... | ... | ... |
| 5166 | 1518 | ham | Subject: put the 10 on the ft\nthe transport v... | 0 |
| 5167 | 404 | ham | Subject: 3 / 4 / 2000 and following noms\nhpl ... | 0 |
| 5168 | 2933 | ham | Subject: calpine daily gas nomination\n>\n>\nj... | 0 |
| 5169 | 1409 | ham | Subject: industrial worksheets for august 2000... | 0 |
| 5170 | 4807 | spam | Subject: important online banking alert\ndear ... | 1 |
5171 rows × 4 columns
# Summary statistics of the numeric columns, split by ham vs spam label.
df.groupby('label').describe()
| Unnamed: 0 | label_num | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| label | ||||||||||||||||
| ham | 3672.0 | 1835.5 | 1060.159422 | 0.0 | 917.75 | 1835.5 | 2753.25 | 3671.0 | 3672.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| spam | 1499.0 | 4421.0 | 432.868340 | 3672.0 | 4046.50 | 4421.0 | 4795.50 | 5170.0 | 1499.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words representation: one row per email, one column per
# vocabulary term, entries are raw word counts (a sparse matrix).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])
X
<5171x50447 sparse matrix of type '<class 'numpy.int64'>' with 456145 stored elements in Compressed Sparse Row format>
# Densify the sparse count matrix just to inspect it (memory-heavy: 5171 x 50447).
X.toarray()
array([[1, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=int64)
# Target is the numeric spam label; features are the dense word counts.
# (GaussianNB below requires a dense array, hence toarray().)
y = df['label_num']
X = X.toarray()

# Hold out 20% of the emails for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity-check the shapes of the resulting splits.
for split in (X_train, X_test):
    print(split.shape)
(4136, 50447) (1035, 50447)
# Benchmark model: a random forest over the raw word counts.
rfc = RandomForestClassifier().fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
rfc_acc = accuracy_score(y_test, rfc_pred)
rfc_acc
0.9739130434782609
# Gaussian Naive Bayes: models each feature as normally distributed,
# which is a poor fit for discrete word counts but serves as a comparison.
gnb = GaussianNB().fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)
gnb_acc = accuracy_score(y_test, gnb_pred)
gnb_acc
0.9545893719806763
# Multinomial Naive Bayes: the textbook choice for word-count features.
mnb = MultinomialNB().fit(X_train, y_train)
mnb_pred = mnb.predict(X_test)
mnb_acc = accuracy_score(y_test, mnb_pred)
print(mnb_acc)
0.978743961352657
Here's the comparison of the three models:
| Model | Accuracy Score |
|---|---|
| RandomForestClassifier | 0.9739 |
| GaussianNB | 0.9546 |
| MultinomialNB | 0.9787 |
The RandomForestClassifier, GaussianNB, and MultinomialNB models have all performed quite well on the dataset, with accuracy scores of 0.9739, 0.9546, and 0.9787 respectively.
We can see that Multinomial Naive Bayes has the strongest accuracy score among the three models. This is likely because the text data is well suited for the MultinomialNB model, as it works well with discrete features like word counts.
The RandomForestClassifier is an ensemble learning method that operates by constructing multiple decision trees at training time and outputting the class that is the mode of the classes of the individual trees. This model has the advantage of reducing overfitting by averaging the result which can improve the predictive accuracy and control over-fitting.
GaussianNB implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be Gaussian.
Using the AB_NYC_2019.csv dataset for this part. We want to:
Remove outliers based on price per night for a given apartment/home.
Compare the Z-score approach and the whiskers (IQR) approach to determine which is better at removing the outliers in this case.
and finally we will want to come up with a clean dataset that does not have outliers showcasing all the possibilities
# Load the NYC Airbnb 2019 listings dataset and display it.
listings_path = 'AB_NYC_2019.csv'
df = pd.read_csv(listings_path)
df
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48890 | 36484665 | Charming one bedroom - newly renovated rowhouse | 8232441 | Sabrina | Brooklyn | Bedford-Stuyvesant | 40.67853 | -73.94995 | Private room | 70 | 2 | 0 | NaN | NaN | 2 | 9 |
| 48891 | 36485057 | Affordable room in Bushwick/East Williamsburg | 6570630 | Marisol | Brooklyn | Bushwick | 40.70184 | -73.93317 | Private room | 40 | 4 | 0 | NaN | NaN | 2 | 36 |
| 48892 | 36485431 | Sunny Studio at Historical Neighborhood | 23492952 | Ilgar & Aysel | Manhattan | Harlem | 40.81475 | -73.94867 | Entire home/apt | 115 | 10 | 0 | NaN | NaN | 1 | 27 |
| 48893 | 36485609 | 43rd St. Time Square-cozy single bed | 30985759 | Taz | Manhattan | Hell's Kitchen | 40.75751 | -73.99112 | Shared room | 55 | 1 | 0 | NaN | NaN | 6 | 2 |
| 48894 | 36487245 | Trendy duplex in the very heart of Hell's Kitchen | 68119814 | Christophe | Manhattan | Hell's Kitchen | 40.76404 | -73.98933 | Private room | 90 | 7 | 0 | NaN | NaN | 1 | 23 |
48895 rows × 16 columns
# Summary statistics for the numeric columns; note the extreme max price (10000)
# against a median of 106 — a first hint of heavy outliers.
df.describe()
| id | host_id | latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.889500e+04 | 4.889500e+04 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 48895.000000 | 38843.000000 | 48895.000000 | 48895.000000 |
| mean | 1.901714e+07 | 6.762001e+07 | 40.728949 | -73.952170 | 152.720687 | 7.029962 | 23.274466 | 1.373221 | 7.143982 | 112.781327 |
| std | 1.098311e+07 | 7.861097e+07 | 0.054530 | 0.046157 | 240.154170 | 20.510550 | 44.550582 | 1.680442 | 32.952519 | 131.622289 |
| min | 2.539000e+03 | 2.438000e+03 | 40.499790 | -74.244420 | 0.000000 | 1.000000 | 0.000000 | 0.010000 | 1.000000 | 0.000000 |
| 25% | 9.471945e+06 | 7.822033e+06 | 40.690100 | -73.983070 | 69.000000 | 1.000000 | 1.000000 | 0.190000 | 1.000000 | 0.000000 |
| 50% | 1.967728e+07 | 3.079382e+07 | 40.723070 | -73.955680 | 106.000000 | 3.000000 | 5.000000 | 0.720000 | 1.000000 | 45.000000 |
| 75% | 2.915218e+07 | 1.074344e+08 | 40.763115 | -73.936275 | 175.000000 | 5.000000 | 24.000000 | 2.020000 | 2.000000 | 227.000000 |
| max | 3.648724e+07 | 2.743213e+08 | 40.913060 | -73.712990 | 10000.000000 | 1250.000000 | 629.000000 | 58.500000 | 327.000000 | 365.000000 |
# Count the missing values in each column.
df.isna().sum()
id 0 name 16 host_id 0 host_name 21 neighbourhood_group 0 neighbourhood 0 latitude 0 longitude 0 room_type 0 price 0 minimum_nights 0 number_of_reviews 0 last_review 10052 reviews_per_month 10052 calculated_host_listings_count 0 availability_365 0 dtype: int64
We have some columns that contain missing values: name (16), host_name (21), last_review (10052), and reviews_per_month (10052). The price column, which we focus on here, has no missing values.
# Peek at the first five listings.
df.head()
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
Let us visualize the price column.
# Histogram of nightly prices: the heavy right tail hints at large outliers.
fig, ax = plt.subplots()
ax.hist(df['price'], bins=100)
plt.show()
From the above histogram, we can see how right skewed the graph is, this already indicates that there are some very big values in this dataset compared to most of the other values.
# Boxplot of prices: the extreme values compress the box and whiskers
# to near-invisibility.
fig, ax = plt.subplots()
ax.boxplot(df['price'])
plt.show()
Again, looking at the box plot, we can barely even see the whiskers.
# Same histogram, but with an ad-hoc $1000 cutoff to drop the extreme tail.
below_cutoff = df['price'][df['price'] < 1000]
plt.hist(below_cutoff, bins=100)
plt.show()
When we manually input an arbitrary cutoff, we can already see how much of a difference this makes to our data.
Below, we will explore more standard methods of removing outliers.
from scipy.stats import zscore

# Standardize the price column and flag every row lying more than
# 2 standard deviations from the mean as an outlier.
abs_scores = np.abs(zscore(df['price']))
zscore_outliers = abs_scores > 2
n_out = zscore_outliers.sum()
print(f'Number of rows with an outlier value: {n_out}, Percentage of outliers: {n_out / len(df) * 100:.2f}%')
Number of rows with an outlier value: 767, Percentage of outliers: 1.57%
In the Z-score method above, we are using a threshold of 2. What this means is that we will remove all the points that lie more than 2 standard deviations from the mean. In theory, this should be about 5% of our data, but we only flagged 1.57% here. This is because that rule only applies to normally distributed data, and ours is not, as we can see below.
import seaborn as sns

# Distribution plot with a KDE overlay: price is clearly far from normal.
sns.displot(df, x="price", kde=True, bins=50, aspect=2)
plt.xlabel('Price')
plt.ylabel('Density')
plt.title('Price Distribution')
plt.show()
# Keep only the rows that were not flagged by the Z-score rule.
df_outliers_removed_with_zscore = df.loc[~zscore_outliers]
df_outliers_removed_with_zscore
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48890 | 36484665 | Charming one bedroom - newly renovated rowhouse | 8232441 | Sabrina | Brooklyn | Bedford-Stuyvesant | 40.67853 | -73.94995 | Private room | 70 | 2 | 0 | NaN | NaN | 2 | 9 |
| 48891 | 36485057 | Affordable room in Bushwick/East Williamsburg | 6570630 | Marisol | Brooklyn | Bushwick | 40.70184 | -73.93317 | Private room | 40 | 4 | 0 | NaN | NaN | 2 | 36 |
| 48892 | 36485431 | Sunny Studio at Historical Neighborhood | 23492952 | Ilgar & Aysel | Manhattan | Harlem | 40.81475 | -73.94867 | Entire home/apt | 115 | 10 | 0 | NaN | NaN | 1 | 27 |
| 48893 | 36485609 | 43rd St. Time Square-cozy single bed | 30985759 | Taz | Manhattan | Hell's Kitchen | 40.75751 | -73.99112 | Shared room | 55 | 1 | 0 | NaN | NaN | 6 | 2 |
| 48894 | 36487245 | Trendy duplex in the very heart of Hell's Kitchen | 68119814 | Christophe | Manhattan | Hell's Kitchen | 40.76404 | -73.98933 | Private room | 90 | 7 | 0 | NaN | NaN | 1 | 23 |
48128 rows × 16 columns
# Price distribution after Z-score filtering: the tail is much shorter now.
zscore_prices = df_outliers_removed_with_zscore['price']
plt.hist(zscore_prices, bins=100)
plt.show()

# Boxplot of the same filtered prices.
plt.boxplot(zscore_prices)
plt.show()
# Tukey / IQR ("whiskers") rule: a price is an outlier when it lies more
# than 1.5 * IQR outside the interquartile range [Q1, Q3].
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of the rows the rule flags as outliers (strictly outside the bounds).
iqr_outliers = (df['price'] < lower_bound) | (df['price'] > upper_bound)
print(f'Number of outliers in the price column: {iqr_outliers.sum()}, Percentage of outliers: {iqr_outliers.sum() / len(df) * 100:.2f}%')

# BUG FIX: the original filter used strict inequalities (> lower, < upper),
# which silently dropped the 5 rows whose price equals the upper bound even
# though the count above does not treat them as outliers (it reported 2972
# outliers of 48895 rows, yet only 45918 of the expected 45923 rows survived).
# Inclusive bounds make the filter the exact complement of the outlier mask.
df_outliers_removed_with_iqr = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]
df_outliers_removed_with_iqr
Number of outliers in the price column: 2972, Percentage of outliers: 6.08%
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539 | Clean & quiet apt home by the park | 2787 | John | Brooklyn | Kensington | 40.64749 | -73.97237 | Private room | 149 | 1 | 9 | 2018-10-19 | 0.21 | 6 | 365 |
| 1 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75362 | -73.98377 | Entire home/apt | 225 | 1 | 45 | 2019-05-21 | 0.38 | 2 | 355 |
| 2 | 3647 | THE VILLAGE OF HARLEM....NEW YORK ! | 4632 | Elisabeth | Manhattan | Harlem | 40.80902 | -73.94190 | Private room | 150 | 3 | 0 | NaN | NaN | 1 | 365 |
| 3 | 3831 | Cozy Entire Floor of Brownstone | 4869 | LisaRoxanne | Brooklyn | Clinton Hill | 40.68514 | -73.95976 | Entire home/apt | 89 | 1 | 270 | 2019-07-05 | 4.64 | 1 | 194 |
| 4 | 5022 | Entire Apt: Spacious Studio/Loft by central park | 7192 | Laura | Manhattan | East Harlem | 40.79851 | -73.94399 | Entire home/apt | 80 | 10 | 9 | 2018-11-19 | 0.10 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48890 | 36484665 | Charming one bedroom - newly renovated rowhouse | 8232441 | Sabrina | Brooklyn | Bedford-Stuyvesant | 40.67853 | -73.94995 | Private room | 70 | 2 | 0 | NaN | NaN | 2 | 9 |
| 48891 | 36485057 | Affordable room in Bushwick/East Williamsburg | 6570630 | Marisol | Brooklyn | Bushwick | 40.70184 | -73.93317 | Private room | 40 | 4 | 0 | NaN | NaN | 2 | 36 |
| 48892 | 36485431 | Sunny Studio at Historical Neighborhood | 23492952 | Ilgar & Aysel | Manhattan | Harlem | 40.81475 | -73.94867 | Entire home/apt | 115 | 10 | 0 | NaN | NaN | 1 | 27 |
| 48893 | 36485609 | 43rd St. Time Square-cozy single bed | 30985759 | Taz | Manhattan | Hell's Kitchen | 40.75751 | -73.99112 | Shared room | 55 | 1 | 0 | NaN | NaN | 6 | 2 |
| 48894 | 36487245 | Trendy duplex in the very heart of Hell's Kitchen | 68119814 | Christophe | Manhattan | Hell's Kitchen | 40.76404 | -73.98933 | Private room | 90 | 7 | 0 | NaN | NaN | 1 | 23 |
45918 rows × 16 columns
# Price distribution after IQR filtering.
iqr_prices = df_outliers_removed_with_iqr['price']
plt.hist(iqr_prices, bins=100)
plt.show()

# Boxplot of the same filtered prices.
plt.boxplot(iqr_prices)
plt.show()
# Compare the original price distribution against both cleaned versions
# on a shared y-axis. (matplotlib.pyplot is already imported at the top of
# the file, so the redundant per-cell re-import is dropped; the three
# copy-pasted plotting stanzas are folded into one loop.)
fig, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
panels = [
    ('Original', df),
    ('IQR', df_outliers_removed_with_iqr),
    ('Z-score', df_outliers_removed_with_zscore),
]
for ax, (title, frame) in zip(axs, panels):
    ax.hist(frame['price'], bins=100)
    ax.set_title(title)
    ax.set_xlabel('Price')
axs[0].set_ylabel('Frequency')
plt.show()
Looking at the grouped histogram above, you can see how our new data frames stack up in comparison to the original data. This is why we remove outliers.
Comparing just the two outlier removers side by side.
# Side-by-side histograms of just the two outlier-removal methods.
# (matplotlib.pyplot is already imported at the top of the file; the
# duplicated plotting code is folded into one loop.)
fig, axs = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
cleaned = [
    ('IQR', df_outliers_removed_with_iqr),
    ('Z-score', df_outliers_removed_with_zscore),
]
for ax, (title, frame) in zip(axs, cleaned):
    ax.hist(frame['price'], bins=100)
    ax.set_title(title)
    ax.set_xlabel('Price')
axs[0].set_ylabel('Frequency')
plt.show()
We can see that the IQR method removed more rows than the Z-score method.
Let's look at the boxplot.
# Side-by-side boxplots of the two cleaned datasets on a shared y-axis.
# (Fixes the copy-pasted comments that wrongly said "Plot histogram";
# matplotlib.pyplot is already imported at the top of the file.)
fig, axs = plt.subplots(1, 2, figsize=(15, 20), sharey=True)
cleaned = [
    ('IQR', df_outliers_removed_with_iqr),
    ('Z-score', df_outliers_removed_with_zscore),
]
for ax, (title, frame) in zip(axs, cleaned):
    ax.boxplot(frame['price'])
    ax.set_title(title)
    ax.set_xlabel('Price')
plt.show()
Placing them side by side here, we can see how many more outliers the IQR method was able to remove.
The IQR method will generally remove more of the data than the Z-score method on a skewed distribution like this one. Because this dataset has a large number of rows, we can afford that loss, so the IQR method is preferred for removing outliers here.